# Import basic libraries
import matplotlib.pyplot as plt
import numpy as np # for linear algebra eg np.log, np.where, np.mean, np.std
import pandas as pd # for import data eg pd.read_csv, pd.DataFrame or pd.Series, pd.get_dummies(col_name,drop_first=True)
import seaborn as sns # mainly for visualisation
import plotly.express as px #plotly visualisation (aka ggplot2 in R)
from tabulate import tabulate #construct table
from datetime import datetime# if dealing with date and time or time series data
sns.set() # Optional:: this just make the plot nicer by changing the color, fontsize etc
#This syntax helps to display inline within frontends like the Jupyter notebook,
#directly below the code cell that produced it.
#The resulting plots will published nicedly within the notebook document.
%matplotlib inline
#display all float to 3dp
pd.set_option('display.float','{:.3f}'.format)
pd.set_option('display.max_columns',50)
pd.set_option('display.max_rows',100)
#Warning messages are typically issued in situations where it is useful to alert the user of some condition in a program,
#where that condition (normally) doesn’t warrant raising an exception and terminating the program.
#For example, one might want to issue a warning when a program uses an obsolete module.
from warnings import filterwarnings
filterwarnings('ignore')
# Imports relevant scikit-learn to this project
# tools for data management
import missingno as msno
from fancyimpute import KNN
# enable_iterative_imputer must be imported before IterativeImputer is
# available from sklearn.impute (it is still experimental in scikit-learn).
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, learning_curve, KFold # import KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE, RFECV ##Recursive Feature Elimination with Cross Validation
from sklearn.pipeline import Pipeline, FeatureUnion
#for Decision Tree Visualisation
from sklearn import tree
#from sklearn.tree import export_graphviz
import pydotplus
from scipy import stats
# unsupervised learnings
from sklearn.cluster import KMeans
# supervised learnings
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import ElasticNet
from sklearn.kernel_ridge import KernelRidge
#from sklearn.svm import SVR
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool, cv
from keras.wrappers.scikit_learn import KerasClassifier
from mlxtend.regressor import StackingCVRegressor
from keras import models,layers
# Model accuracy measures
import sklearn.metrics as metrics
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error as MSE, r2_score, confusion_matrix
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import scikitplot as skplt #Lift Curve / Gain Chart
#Import files/data
# ---- Load the raw campaign results workbook ----
file_path = '../PGDADS_Capstone Assignment/Office Supply Campaign ResultS 7-23-19.xlsx'
df = pd.read_excel(file_path) ##,index_col=0)
print('Data is loaded successfully!')
# Shape report: observation/feature counts split by dtype family.
print('There are {} observations and {} features in the dataset:'.format(*df.shape))
print('\t Of which, {} numeric features'.format(df.select_dtypes(include='float64').shape[1]))
print('\t And, {} non-numeric/object features.'.format(df.select_dtypes(include='object').shape[1]))
print('\t And, {} non-numeric/non-object features.'.format(df.select_dtypes(exclude=['float64','object']).shape[1]))
df.info()
# Normalise column names: spaces -> underscores so columns work as attributes.
df.rename(columns=lambda col: col.replace(' ', '_'), inplace=True)
df.info()
df.info()
# Distinct values of every object (string) column.
print(f"Unique values in each non-numeric column:\t")
for col in df.select_dtypes(include=[object]):
    print(col," :\n", df[col].unique())
# ...and the count of distinct values per object column.
print("Unique values in each non-numeric column:\t")
for col in df.select_dtypes(include=[object]):
    print(col,":", df[col].nunique())
# Drop rows carrying the sentinel value 911 in Desk.
# NOTE(review): Desk otherwise holds 'Y'/'N' strings -- 911 presumably marks
# bad records; confirm against the data dictionary.
df_updated = df[df['Desk'] != 911].copy()
df_updated.info()
print('There are {} observations and {} features in the dataset:'.format(*df_updated.shape))
print('\t Of which, {} numeric features'.format(df_updated.select_dtypes(include='float64').shape[1]))
print('\t And, {} non-numeric/object features.'.format(df_updated.select_dtypes(include='object').shape[1]))
print('\t And, {} non-numeric/non-object features.'.format(df_updated.select_dtypes(exclude=['float64','object']).shape[1]))
print('\t Removed {} feature(s).'.format(abs(df_updated.shape[1] - df.shape[1])))
print('\t Removed {} observation(s) (i.e. Desk = 911).'.format(abs(df_updated.shape[0] - df.shape[0])))
# Flag rows with negative sales in either period (treated as invalid data).
neg_bool = ((df_updated['Campaign_Period_Sales'] < 0) | (df_updated['Historical_Sales_Volume'] < 0))
print(f"Number of negative values:\t{sum(neg_bool)}")
display(df_updated.loc[neg_bool, :])
# Remove every record of any customer that has a negative sales value.
df_vers = df_updated.copy()
df_vers = df_vers[~df_vers['Customer_Number'].isin(df_updated['Customer_Number'][neg_bool])]
df_vers.info()
print('There are {} observations and {} features in the dataset:'.format(*df_vers.shape))
print('\t Of which, {} numeric features'.format(df_vers.select_dtypes(include='float64').shape[1]))
print('\t And, {} non-numeric/object features.'.format(df_vers.select_dtypes(include='object').shape[1]))
print('\t And, {} non-numeric/non-object features.'.format(df_vers.select_dtypes(exclude=['float64','object']).shape[1]))
print('\t Removed {} feature(s).'.format(abs(df_vers.shape[1] - df_updated.shape[1])))
print('\t Removed {} observation(s) (i.e. Negative Sales Values).'.format(abs(df_vers.shape[0] - df_updated.shape[0])))
# Convert the nine Y/N product-indicator columns to binary floats in one
# loop instead of nine duplicated statements ('YY' and 'Y' -> 1.0, 'N' -> 0.0;
# any other value, including NaN, maps to NaN exactly as before).
yn_to_binary = {'YY': 1., 'Y': 1., 'N': 0.}
indicator_cols = ['Desk', 'Executive_Chair', 'Standard_Chair', 'Monitor',
                  'Printer', 'Computer', 'Insurance', 'Toner', 'Office_Supplies']
for indicator in indicator_cols:
    df_vers[indicator] = df_vers[indicator].map(yn_to_binary)
# Blank-string employee counts are really missing values.
df_vers['Number_of_Employees'] = np.where(df_vers['Number_of_Employees']== ' ',np.nan,df_vers['Number_of_Employees'])
# Snapshot the mapped dataframe before imputation work begins.
df_vers1 = df_vers.copy()
df_vers1.info()
print('There are {} observations and {} features in the dataset:'.format(*df_vers.shape))
print('\t Of which, {} numeric features'.format(df_vers.select_dtypes(include='float64').shape[1]))
print('\t And, {} non-numeric/object features.'.format(df_vers.select_dtypes(include='object').shape[1]))
print('\t And, {} non-numeric/non-object features.'.format(df_vers.select_dtypes(exclude=['float64','object']).shape[1]))
print('\t Added {} feature(s).'.format(abs(df_vers.shape[1] - df_vers1.shape[1])))
print('\t Removed {} observation(s).'.format(abs(df_vers1.shape[0] - df_vers.shape[0])))
print("Unique values in each non-numeric column:\t")
for col in df_vers1.select_dtypes(include=[object]):
    print(col,":", df_vers1[col].unique())
print("Unique values in each numeric column:\t")
for col in df_vers1.select_dtypes(exclude=[object]):
    print(col,":", df_vers1[col].nunique())
# ---- Missing-data summary ----
df_vers2 = df_vers1.copy()
# isnull().count() counts ALL rows per column (denominator);
# isnull().sum() counts only the missing entries (numerator).
total_obs = df_vers2.isnull().count()
calculate_missing = df_vers2.isnull().sum()
total_missing = calculate_missing[calculate_missing.values>0].sort_values(ascending=False)
perc_missing = round(total_missing/total_obs*100,2)
perc_missing = perc_missing[perc_missing.values>0].sort_values(ascending=False)
col_missing_data = pd.concat([total_missing,perc_missing],axis=1,keys=['#','%']).sort_values('#',ascending=False)
summary_table = tabulate(col_missing_data, headers=["Features", "No of Obs Missing #", "No of Obs Missing %"])
print('There are {} columns with missing data.'.format(col_missing_data.shape[0]))
print('These columns are as follows:\n')
print(summary_table)
# Impute the binary product-indicator columns with their mode
# (row 0 of DataFrame.mode() is the most frequent value per column).
apply_mode = df_vers2[total_missing.index].mode()
apply_mode
for col in apply_mode[['Toner','Insurance','Printer','Monitor','Standard_Chair','Executive_Chair']]:
    df_vers2[col] = df_vers2[col].fillna(apply_mode[col][0])
df_vers2.info()
def encode(data):
    """Ordinal-encode the non-null values of a pandas Series in place.

    Null entries are left untouched so that a model-based imputer can fill
    them later; the fitted ordinal codes are written back over the original
    non-null positions.

    Parameters
    ----------
    data : pd.Series
        A categorical column; mutated in place and also returned.

    Returns
    -------
    pd.Series
        The same Series object, with non-null values replaced by codes.
    """
    # Removed dead code: the original copied the module-level raw dataframe
    # (`df_enc = df.copy()`) on every call and never used it.
    encoder = OrdinalEncoder()
    # Encode only the observed values; NaNs stay NaN for the imputer.
    nonulls = np.array(data.dropna())
    # OrdinalEncoder expects a 2-D array of shape (n_samples, n_features).
    impute_reshape = nonulls.reshape(-1,1)
    impute_ordinal = encoder.fit_transform(impute_reshape)
    # Write the codes back onto the non-null positions of the original Series.
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data
# ---- Ordinal-encode remaining object columns, then model-based imputation ----
df_vers3 = df_vers2.copy()
df_vers3.head()
df_vers3.select_dtypes(include='object').columns
# Encode every object column in place, except Repurchase_Method.
for col in df_vers3.select_dtypes(include='object').drop('Repurchase_Method',axis=1).columns:
    encode(df_vers3[col])
df_vers3.head()
# IterativeImputer predicts each column's missing entries from the other
# columns using an ExtraTrees model (10 estimators; fixed seed for
# reproducibility).
im_br = IterativeImputer(random_state=24, estimator=ExtraTreesClassifier(n_estimators=10))
cat_df_imp = pd.DataFrame(im_br.fit_transform(df_vers3.select_dtypes(include='object').drop('Repurchase_Method',axis=1)),
                          index=df_vers3.index,
                          columns=df_vers3.select_dtypes(include='object').drop('Repurchase_Method',axis=1).columns)
cat_df_imp.head(10)
df_vers3.select_dtypes(include='object').drop('Repurchase_Method',axis=1).head(10)
# Compare distinct values before vs after imputation.
print("Unique values in each non-numeric column:\t")
for col in df_vers3.select_dtypes(include=[object]).drop('Repurchase_Method',axis=1):
    print(col,":", df_vers3[col].unique())
print("Unique values in each non-numeric column:\t")
for col in cat_df_imp:
    print(col,":", cat_df_imp[col].unique())
# Remember the original category labels (per column, from the pre-encoding
# dataframe) so the numeric codes can be translated back after imputation.
list_of_cats = []
for col in df_vers2.drop('Repurchase_Method',axis=1).dropna().select_dtypes(include='object').columns:
    uniq_values = df_vers2.drop('Repurchase_Method',axis=1).dropna().select_dtypes(include='object')[col].unique().tolist()
    try:
        nan_idx = uniq_values.index(np.nan)
        uniq_values.pop(nan_idx)
        list_of_cats.append(uniq_values)
    except ValueError as ve:
        # No NaN in the unique list -- keep all values as-is.
        list_of_cats.append(uniq_values)
list_of_cats
def decode_cat_encodings(df_encoded):
    """Map imputed numeric codes back to their original category labels.

    Parameters
    ----------
    df_encoded : array-like or pd.DataFrame
        Output of the iterative imputer for the encoded object columns.

    Returns
    -------
    pd.DataFrame
        Same shape, with numeric codes replaced by category strings.

    NOTE(review): the zip below pairs codes with labels purely by the order
    in which they appear (`unique()` appearance order vs. the stored
    `list_of_cats` order). This assumes both orderings line up -- verify,
    otherwise labels may be permuted. Also relies on the module-level
    globals `df_vers3` and `list_of_cats`.
    """
    df = pd.DataFrame(df_encoded, columns=df_vers3.drop('Repurchase_Method',axis=1).select_dtypes(include='object').columns)
    for col_idx, col in enumerate(df_vers3.drop('Repurchase_Method',axis=1).select_dtypes(include='object').columns):
        uniq_lst_enc = df[col].unique().tolist()
        level_mapper = {num: cat for num, cat in zip(uniq_lst_enc, list_of_cats[col_idx])}
        df[col] = df[col].map(level_mapper)
    return df
# Reassemble the dataframe: drop the three encoded columns and splice the
# decoded/imputed versions back in.
imputed_col = decode_cat_encodings(cat_df_imp)
imputed_col.isnull().sum()
df_vers4 = pd.concat([df_vers3.drop(['Last_Transaction_Channel','Number_of_Employees','Language'],axis=1), imputed_col],axis = 1)
df_vers4.info()
df_vers4
#Recheck if all missing data are imputed correctly.
total_obs = df_vers4.isnull().count()
calculate_missing = df_vers4.isnull().sum()
total_missing = calculate_missing[calculate_missing.values>0].sort_values(ascending=False)
perc_missing = round(total_missing/total_obs*100,2)
perc_missing = perc_missing[perc_missing.values>0].sort_values(ascending=False)
col_missing_data = pd.concat([total_missing,perc_missing],axis=1,keys=['#','%']).sort_values('#',ascending=False)
summary_table = tabulate(col_missing_data, headers=["Features", "No of Obs Missing #", "No of Obs Missing %"])
print('There are {} columns with missing data.'.format(col_missing_data.shape[0]))
print('These columns are as follows:\n')
print(summary_table)
print('\n')
print('Since there are {} columns with missing data, \
which means we have succesfully imputed with mode accordingly.'.format(col_missing_data.shape[0]))
df_vers4.shape
# ---- Feature engineering ----
# Binary response: any positive campaign-period sales counts as a response.
df_vers4['Response']=np.where(df_vers4['Campaign_Period_Sales']<=0,0.,1.)
# Customer tenure in years, from first-purchase date to "now" (month
# precision). NOTE(review): uses the wall clock, so values drift over time.
df_vers4['TenureYrs']=((datetime.now().year - pd.Series(df_vers4["Date_of_First_Purchase"]).apply(lambda x: x.year)) * 12 + \
(datetime.now().month-pd.Series(df_vers4["Date_of_First_Purchase"]).apply(lambda x: x.month)))/12
# Breadth of holdings: sum of the nine binary product indicators.
df_vers4['ProductMix'] = df_vers4['Desk']+df_vers4['Executive_Chair']+df_vers4['Standard_Chair']+\
df_vers4['Monitor']+df_vers4['Printer']+df_vers4['Computer']+\
df_vers4['Insurance']+df_vers4['Toner']+df_vers4['Office_Supplies']
df_vers4['Last_Transaction_Channel'].unique()
# Map the last transaction channel to a preferred contact channel; anything
# unmatched (including NaN) falls through to 'Digital'.
df_vers4['Contact_Channel'] = np.where(df_vers4['Last_Transaction_Channel']=='AUTO RENEW','eDM',\
np.where(df_vers4['Last_Transaction_Channel']=='BILLING','eDM',\
np.where(df_vers4['Last_Transaction_Channel']=='MAIL','DM',\
np.where(df_vers4['Last_Transaction_Channel']=='PHONE','Telemarket',\
np.where(df_vers4['Last_Transaction_Channel']=='WEB','Digital',\
np.where(df_vers4['Last_Transaction_Channel']=='None','All',\
np.where(df_vers4['Last_Transaction_Channel']=='BRANCH (POS)','DM',\
np.where(df_vers4['Last_Transaction_Channel']=='BRANCH (PHONE)','Telemarket','Digital'))))))))
df_vers4.info()
df_vers4.shape
print("Unique values in each non-numeric column:\t")
for col in df_vers4.select_dtypes(include=[object]):
    print(col,":", df_vers4[col].unique())
new_df = df_vers4.copy()
# Pairwise Pearson correlations (identifier column excluded).
corr = new_df.drop('Customer_Number',axis=1).corr()
plt.figure(figsize=(12,12))
sns.heatmap(corr, annot=True, cmap='coolwarm')
sns.despine()
def cramers_v(x, y):
    """Bias-corrected Cramér's V association between two categorical series.

    Uses the Bergsma/Wicher correction to the classic phi² statistic so the
    measure is less inflated for small samples / large tables.

    Parameters
    ----------
    x, y : pd.Series
        Categorical (or discrete) columns of equal length.

    Returns
    -------
    float
        Association strength in [0, 1], rounded to 4 decimal places.
    """
    from scipy.stats import chi2_contingency
    # Locals renamed: the originals shadowed the module-level sklearn imports
    # `chi2` and `confusion_matrix`.
    contingency = pd.crosstab(x, y)
    chi2_stat = chi2_contingency(contingency)[0]
    n = contingency.sum().sum()
    phi2 = chi2_stat / n
    r, k = contingency.shape
    # Bias correction: shrink phi² and the table dimensions by sample size.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return round(np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1))), 4)
new_df.columns
print("Unique values in each non-numeric column:\t")
for col in new_df.select_dtypes(include=[object]):
    print(col,":", new_df[col].unique())
# Candidate feature groups for dimensionality reduction.
group1 = ['Monitor','Printer','Computer','Standard_Chair','ProductMix']
group2 = ['Do_Not_Direct_Mail_Solicit', 'Do_Not_Email', 'Do_Not_Telemarket']
group3 = ['Repurchase_Method', 'Last_Transaction_Channel']
X_PCA1 = pd.DataFrame(new_df, columns=group1)
X_PCA2 = pd.DataFrame(new_df, columns=group2)
X_PCA3 = pd.DataFrame(new_df, columns=group3)
y_PCA1 = pd.Series(new_df['Response'], name='label')
new_df[['Desk','Executive_Chair','Standard_Chair','Monitor','Printer','Computer','Insurance','Toner','Office_Supplies','ProductMix']].corr()
#f,axes = plt.subplots(1, 1, figsize=(15,10))
# Scree plot for group 1: how the variance spreads across components.
pca1 = PCA(n_components=5).fit(X_PCA1)
#plt.figure(figsize=(15,10))
plt.plot(pca1.explained_variance_)
plt.title('Monitor / Printer / Computer / Std_Chair / Prod_Mix')
plt.suptitle('Principal Component Analysis Scree Plot - Group 1')
plt.xlabel('# Components')
plt.ylabel('Explained Variance %')
plt.grid()
new_df[group1].corr()
feature1 = pd.DataFrame(new_df[group1])
from pandas.plotting import scatter_matrix
scatter_matrix(feature1,alpha=0.2,figsize=(15,15),diagonal='kde')
sns.despine()
# Variance-normalise (divide by std), then keep a single component.
feature_scaled1 = feature1/feature1.std()
pca_scaled1 =PCA(n_components=1).fit_transform(feature_scaled1)
components_scaled1 = PCA(n_components=1).fit(feature_scaled1).components_
print('Dimension of projected feature: {}'.format(pca_scaled1.shape[1]))
print('\nPCA Components:\n1st{}'.format(np.round(components_scaled1[0],4)))
#,np.round(components_scaled1[1],4)))
display(pd.DataFrame(abs(components_scaled1),\
index=['PCA_1'], \
columns=[group1]))
# Row masks reused by the PCA scatter plots below.
response_rows = new_df['Response'] == 1
not_response_rows = new_df['Response'] == 0
#plt.scatter((pca_scaled1)[response_rows,0]),(pca_scaled1)[response_rows,1])
#plt.scatter((pca_scaled1)[not_response_rows,0]),(pca_scaled1)[not_response_rows,1])
#plt.xlabel('Principal Component 1')
#plt.ylabel('Principal Component 2')
#plt.suptitle('PCA Scaled: Customers by Response')
#plt.title('PCA #1 Explained 55% & PCA #2 Explained 45%')
#plt.legend(['Response','Not Response'])
#plt.show()
# Relative (absolute) loading of each feature on the first component.
exp_var_ratio1 = abs(components_scaled1[0])/np.sum(abs(components_scaled1[0]))
#exp_var_ratio2 = abs(components_scaled1[1])/np.sum(abs(components_scaled1[1]))
print(np.round(exp_var_ratio1,3))
#print(np.round(exp_var_ratio2,4))
#exp_var_ratio1 = np.vstack([exp_var_ratio1, exp_var_ratio2])
display(pd.DataFrame(exp_var_ratio1[np.newaxis],\
index=['PCA_1'], \
columns=[group1]))
#print('PCA_1: {:.2f}'.format(np.sum(abs(components_scaled1[0]))/(np.sum(abs(components_scaled1[0]))+np.sum(abs(components_scaled1[1])))))
#print('PCA_2: {:.2f}'.format(np.sum(abs(components_scaled1[1]))/(np.sum(abs(components_scaled1[0]))+np.sum(abs(components_scaled1[1])))))
# ---- Group 2: the three do-not-contact flags ----
new_df[['Do_Not_Direct_Mail_Solicit','Do_Not_Email','Do_Not_Telemarket']].corr()
# Cramér's V measures the (categorical) association between the flag pairs.
print('Cramers V between Do_Not_Direct_Mail_Solicit and Do_Not_Telemarket : {}'.format(cramers_v(new_df.Do_Not_Telemarket, new_df.Do_Not_Direct_Mail_Solicit)))
print('Cramers V between Do_Not_Direct_Mail_Solicit and Do_Not_Email : {}'.format(cramers_v(new_df.Do_Not_Email, new_df.Do_Not_Direct_Mail_Solicit)))
print('Cramers V between Do_Not_Telemarket and Do_Not_Email : {}'.format(cramers_v(new_df.Do_Not_Email, new_df.Do_Not_Telemarket)))
#f,axes = plt.subplots(1, 1, figsize=(15,10))
# Scree plot for group 2.
pca2 = PCA(n_components=3).fit(X_PCA2)
#plt.figure(figsize=(15,10))
plt.plot(pca2.explained_variance_)
plt.title('Do Not Email / Do Not Mail Solicit / Do Not Telemarket')
plt.suptitle('Principal Component Analysis Scree Plot - Group 2')
plt.xlabel('# Components')
plt.ylabel('Explained Variance %')
plt.grid()
new_df[group2].corr()
feature2 = pd.DataFrame(new_df[group2])
# Variance-normalise, then keep two components this time.
feature_scaled2 = feature2/feature2.std()
pca_scaled2 =PCA(n_components=2).fit_transform(feature_scaled2)
components_scaled2 = PCA(n_components=2).fit(feature_scaled2).components_
print('Dimension of projected feature: {}'.format(pca_scaled2.shape[1]))
print('\nPCA Components:\n1st{}\n 2nd{}'.format(np.round(components_scaled2[0],4),np.round(components_scaled2[1],4)))
display(pd.DataFrame(abs(components_scaled2),\
index=['PCA_1','PCA_2'], \
columns=[group2]))
# Scatter of the two components, coloured by campaign response.
plt.scatter(abs(pca_scaled2)[response_rows,0],abs(pca_scaled2)[response_rows,1])
plt.scatter(abs(pca_scaled2)[not_response_rows,0],abs(pca_scaled2)[not_response_rows,1])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Scaled: Customers by Response')
plt.legend(['Response','Not Response'])
plt.show()
# Normalised absolute loadings per component.
exp_var_ratio3 = abs(components_scaled2[0])/np.sum(abs(components_scaled2[0]))
exp_var_ratio4 = abs(components_scaled2[1])/np.sum(abs(components_scaled2[1]))
print(np.round(exp_var_ratio3,4))
print(np.round(exp_var_ratio4,4))
exp_var_ratio3 = np.vstack([exp_var_ratio3, exp_var_ratio4])
display(pd.DataFrame(abs(exp_var_ratio3),\
index=['PCA_1','PCA_2'], \
columns=[group2]))
print('PCA_1: {:.2f}'.format(np.sum(abs(components_scaled2[0]))/(np.sum(abs(components_scaled2[0]))+np.sum(abs(components_scaled2[1])))))
print('PCA_2: {:.2f}'.format(np.sum(abs(components_scaled2[1]))/(np.sum(abs(components_scaled2[0]))+np.sum(abs(components_scaled2[1])))))
# ---- Group 3: repurchase method vs last transaction channel ----
print(new_df.groupby(['Repurchase_Method','Last_Transaction_Channel']).size())
print('Cramers V between RepurchMth and LastTranChannel: {}'.format(cramers_v(new_df.Repurchase_Method, new_df.Last_Transaction_Channel)))
print('Cramers V between CtcChannel and LastTranChannel: {}'.format(cramers_v(new_df.Contact_Channel, new_df.Last_Transaction_Channel)))
#Monitor Printer Computer Office_Supplies Standard_Chair ProdMix
#principalDf1 = pd.DataFrame(data=pca_scaled1,columns=['PCA_1_Prt_Mon_Com_Chr','PCA_2_OffSupplies'])
# Wrap the PCA projections as named columns for modelling.
principalDf1 = pd.DataFrame(data=pca_scaled1,columns=['PCA_1_Prt_Mon_Com_Chr'])
principalDf1
#Do_Not_Direct_Mail_Solicit Do_Not_Email Do_Not_Telemarket
principalDf2 = pd.DataFrame(data=pca_scaled2,columns=['PCA_1_DM_Tele','PCA_2_Email'])
principalDf2
# reset_index() gives new_df a 0..n-1 index so it concatenates positionally
# with the PCA frames (this adds an 'index' column that is dropped later).
final_df = pd.concat([new_df.reset_index(), principalDf1],axis = 1)
final_df = pd.concat([final_df, principalDf2],axis = 1)
final_df.info()
print("Unique values in each non-numeric column:\t")
for col in final_df.select_dtypes(include=[object]):
    print(col,":", final_df[col].unique())
print('There are {} observations and {} features in the dataset:'.format(*final_df.shape))
print('\t Of which, {} numeric features'.format(final_df.select_dtypes(include=['int64','float64']).shape[1]))
print('\t And, {} non-numeric/object features.'.format(final_df.select_dtypes(include='object').shape[1]))
print('\t And, {} non-numeric/non-object features.'.format(final_df.select_dtypes(exclude=['int64','float64','object']).shape[1]))
print('\t Add {} new feature(s).'.format(abs(new_df.shape[1] - final_df.shape[1])))
print('\t Removed {} observation(s).'.format(abs(df_vers4.shape[0] - final_df.shape[0])))
# Correlation heatmap excluding the raw columns already summarised by PCA.
list_combined = ['Monitor','Printer','Computer','Standard_Chair','ProductMix',\
'Do_Not_Direct_Mail_Solicit', 'Do_Not_Email', 'Do_Not_Telemarket',\
'Repurchase_Method', 'Contact_Channel']
corr1 = final_df.drop(['index','Customer_Number','Campaign_Period_Sales'],axis=1).drop(list_combined,axis=1).corr()
plt.figure(figsize=(15,15))
sns.heatmap(corr1, annot=True, cmap='coolwarm')
sns.despine()
# ---- Outlier treatment, Option 1: z-score filtering ----
df_vers5 = final_df.copy()
df_vers5.describe()
# Absolute z-scores for the two heavily skewed numeric columns.
f = df_vers5[['Historical_Sales_Volume']]
z1 = np.abs(stats.zscore(f))
print(z1)
g = df_vers5[['Number_of_Prior_Year_Transactions']]
z2 = np.abs(stats.zscore(g))
print(z2)
# Keep rows within 3 standard deviations of the mean.
df_z1_score = df_vers5.copy()
df_z1_score = f[(z1 < 3).all(axis=1)]
df_z1_score.info()
# NOTE(review): the baselines below compare against df_vers2, not df_vers5 --
# the reported counts are only correct if no rows were lost between those
# two versions; confirm.
print('Total Observations To Be Removed: {}'.format(df_vers2.shape[0]-df_z1_score.shape[0]))
df_z2_score = df_vers5.copy()
df_z2_score =g[(z2 < 3).all(axis=1)]
df_z2_score.info()
print('Total Observations To Be Removed: {}'.format(df_vers2.shape[0]-df_z2_score.shape[0]))
# Apply the z-score filter for sales volume; for transaction counts a manual
# cutoff (< 300) is used instead of the z-score filter.
clean_df1 = df_vers5[df_vers5.index.isin(df_z1_score.index)]
#clean_df2 = clean_df1[clean_df1.index.isin(df_z2_score.index)]
clean_df2 = clean_df1[clean_df1['Number_of_Prior_Year_Transactions']<300]
print('Total outliers removed from Historical_Sales_Volume : {}'.format(abs(df_vers5.shape[0]-clean_df1.shape[0])))
print('Total outliers removed from Number_of_Prior_Year_Transactions : {}'.format(abs(clean_df1.shape[0]-clean_df2.shape[0])))
print('In summary, total outliers removed : {}'.format(abs(df_vers4.shape[0]-clean_df2.shape[0])))
print('From original {} observations to {}.'.format(df_vers4.shape[0],clean_df2.shape[0]))
clean_df2.info()
print('After removing outliers, there are now {} observations and {} features.'.format(*clean_df2.shape))
print('In summary, we have removed {} observations due to outliers.'.format(df_vers2.shape[0]-clean_df2.shape[0]))
print('Which means, we still have {} of the data retained, which is still reasonable.'.format(str(round(len(clean_df2)/len(df_vers2)*100,2))+str('%')))
# ---- Outlier treatment, Option 2: band the skewed numerics instead ----
final_df_new = final_df.copy()
final_df_new.describe()
final_df_new.select_dtypes(include=['int64','float64']).columns
# Hand-crafted sales-volume bands; '01.'..'07.' prefixes keep labels sorted.
final_df_new['HistSalesVolBand'] = np.where(final_df_new['Historical_Sales_Volume'] >= 1500000,'01.MT1500M',\
np.where(final_df_new['Historical_Sales_Volume'] >= 800000,'02.800KTO1500M',\
np.where(final_df_new['Historical_Sales_Volume'] >= 500000,'03.500KTO800K',\
np.where(final_df_new['Historical_Sales_Volume'] >= 300000,'04.300KTO500K',\
np.where(final_df_new['Historical_Sales_Volume'] >= 150000,'05.150KTO300K',\
np.where(final_df_new['Historical_Sales_Volume'] >= 50000,'06.50KTO150K','07.LT50K'))))))
# Response rate by band, as table and bar chart.
final_df_new.groupby(['HistSalesVolBand', 'Response']).size()
final_df_new.groupby(['HistSalesVolBand', 'Response']).size().unstack(level=1).plot(kind='bar')
sns.despine(left=True, right=True)
#final_df_new['Historical_Sales_Bin'] = pd.qcut(final_df_new['Historical_Sales_Volume'], 8)
#final_df_new.groupby(['Historical_Sales_Bin', 'Response']).size()
#final_df_new.groupby(['Historical_Sales_Bin', 'Response']).size().unstack(level=1).plot(kind='bar')
#sns.despine(left=True, right=True)
# Prior-year transaction-count bands.
final_df_new['PriorYrTranBand'] = np.where(final_df_new['Number_of_Prior_Year_Transactions'] >= 40,'01.MTE40',\
np.where(final_df_new['Number_of_Prior_Year_Transactions'] >= 30,'02.30TO39',\
np.where(final_df_new['Number_of_Prior_Year_Transactions'] >= 25,'03.25TO29',\
np.where(final_df_new['Number_of_Prior_Year_Transactions'] >= 20,'04.20TO24',\
np.where(final_df_new['Number_of_Prior_Year_Transactions'] >= 15,'05.15TO19',\
np.where(final_df_new['Number_of_Prior_Year_Transactions'] >= 5,'06.5TO14','07.LT5'))))))
final_df_new.groupby(['PriorYrTranBand', 'Response']).size()
final_df_new.groupby(['PriorYrTranBand', 'Response']).size().unstack(level=1).plot(kind='bar')
sns.despine(left=True, right=True)
#final_df_new['Num_of_Prior_Trans_Bin'] = pd.qcut(final_df_new['Number_of_Prior_Year_Transactions'], 8)
#final_df_new.groupby(['Num_of_Prior_Trans_Bin', 'Response']).size()
#final_df_new.groupby(['Num_of_Prior_Trans_Bin', 'Response']).size().unstack(level=1).plot(kind='bar')
#sns.despine(left=True, right=True)
# Tenure bands (years since first purchase).
final_df_new['Tenure_Band'] = np.where(final_df_new['TenureYrs'] >= 50,'01.MTE50 Yrs',\
np.where(final_df_new['TenureYrs'] >= 40,'02.40TO50 Yrs',\
np.where(final_df_new['TenureYrs'] >= 30,'03.30TO40 Yrs',\
np.where(final_df_new['TenureYrs'] >= 25,'04.25TO30 Yrs',\
np.where(final_df_new['TenureYrs'] >= 20,'05.20TO25 Yrs',\
np.where(final_df_new['TenureYrs'] >= 10,'06.10TO20 Yrs',\
np.where(final_df_new['TenureYrs'] >= 5,'07.5TO10 Yrs','08.LT5 Yrs')))))))
final_df_new.groupby(['Tenure_Band', 'Response']).size()
final_df_new.groupby(['Tenure_Band', 'Response']).size().unstack(level=1).plot(kind='bar')
sns.despine(left=True, right=True)
#final_df_new['Tenure_Bin'] = pd.qcut(final_df_new['TenureYrs'], 8)
#final_df_new.groupby(['Tenure_Bin', 'Response']).size()
#final_df_new.groupby(['Tenure_Bin', 'Response']).size().unstack(level=1).plot(kind='bar')
#sns.despine(left=True, right=True)
# ---- Compare the candidate modelling datasets ----
print('Option 1: Remove Outliers: {} observations with {} features'.format(clean_df2.shape[0],clean_df2.shape[1]))
print('Option 2: Binning : {} observations with {} features'.format(final_df_new.shape[0],final_df_new.shape[1]))
# Option 1
model_df = clean_df2.copy()
print('Original Dataset after removed outliers : {} observations with {} features.'.format(model_df.shape[0],model_df.shape[1]),'\n')
# Identifier / leakage / PCA-source columns excluded from modelling.
list_to_remove = ['Contact_Channel','Repurchase_Method','index','Do_Not_Direct_Mail_Solicit','Do_Not_Email','Do_Not_Telemarket','Response','Date_of_First_Purchase','Customer_Number','Campaign_Period_Sales','Standard_Chair','Monitor','Printer','Computer','ProductMix']
print('Model Dataset if using Option 1:')
print(model_df.drop(list_to_remove, axis=1).info())
# Option 2
# NOTE(review): model_df / list_to_remove are rebound here -- whichever
# option's cell runs last wins for the modelling steps further below.
model_df = final_df_new.copy()
print('Original Dataset if using binnings : {} observations with {} features.'.format(model_df.shape[0],model_df.shape[1]),'\n')
list_to_remove = ['Historical_Sales_Volume','Number_of_Prior_Year_Transactions','TenureYrs','Contact_Channel','Repurchase_Method','index','Do_Not_Direct_Mail_Solicit','Do_Not_Email','Do_Not_Telemarket','Response','Date_of_First_Purchase','Customer_Number','Campaign_Period_Sales','Standard_Chair','Monitor','Printer','Computer','ProductMix']
print('Model Dataset if using Option 2:')
print(model_df.drop(list_to_remove, axis=1).info())
# Option 3: hand-picked shortlist of features.
model_df = clean_df2[['Response','Historical_Sales_Volume','Last_Transaction_Channel','Number_of_Prior_Year_Transactions','Toner','Executive_Chair','Insurance','Desk','Office_Supplies','PCA_1_Prt_Mon_Com_Chr','PCA_1_DM_Tele','PCA_2_Email']].copy()
print('Dataset with KSelect Features : {} observations with {} features.'.format(model_df.shape[0],model_df.shape[1]),'\n')
print('Model Dataset if using Option 3:')
print(model_df.info())
# BUGFIX: `list_to_remove` (set by the Option 2 step) contains columns that
# do not exist in this shortlist (e.g. 'index', 'TenureYrs'), so a plain
# .drop() raised KeyError. errors='ignore' drops only the columns present.
for col in model_df.drop(list_to_remove, axis=1, errors='ignore').select_dtypes(include=[object]):
    print(col,":\n",model_df[col].unique())
for col in model_df.drop(list_to_remove, axis=1, errors='ignore').select_dtypes(include=[object]):
    print(col,":",model_df[col].nunique())
# So far, Option 1 provides the best outcome.
# ---- Build the modelling matrices ----
# NOTE(review): `model_df` / `list_to_remove` refer to whichever option cell
# ran last (Option 3 above); in the original notebook Option 1 was chosen.
X = model_df.drop(list_to_remove, axis=1)
#X['Avg_Sales_PU'] = X['Historical_Sales_Volume']/X['Number_of_Prior_Year_Transactions']
#X = model_df.drop('Response',axis=1)
y = model_df['Response']
# Apply train, test split
print(X.shape, y.shape)
print('So in X dataframe, there are {} independent variables and in Y dataframe, only 1 target variable.'.format(X.shape[1], 1))
print('Also there are {} number of observations.'.format(X.shape[0]))
X.select_dtypes(exclude='object').info()
X.select_dtypes(include='object').info()
# One-hot encode the data using pandas get_dummies
X_onehot = pd.get_dummies(X)
X_onehot
# Stratified 50/50 split keeps the response rate balanced across both halves.
X_train, X_validation, y_train, y_validation = train_test_split(X_onehot, y, stratify=y, test_size = 0.50, random_state = 42)
print('OneHot_Encoded Training examples: {}'.format(X_train.shape[0]))
print('So in X_train dataframe, there are {} independent variables and only 1 target variable.'.format(X_train.shape[1],+1))
print('Also there are {} number of observations.'.format(X_train.shape[0]))
print('==' * 50)
print('OneHot_Encoded Validation examples: {}'.format(X_validation.shape[0]))
print('So in X_validation dataframe, there are {} independent variables and only 1 target variable.'.format(X_validation.shape[1],+1))
print('Also there are {} number of observations.'.format(X_validation.shape[0]))
def score(confusion_matrix):
    """Derive headline classification metrics from a 2x2 confusion matrix.

    Assumes the layout
        [[TP, FP],
         [FN, TN]]
    i.e. the positive class in row/column 0. NOTE(review): sklearn's
    `confusion_matrix` for 0/1 labels puts TN at [0,0] -- confirm which
    convention the caller's matrix uses before comparing with sklearn scores.

    Parameters
    ----------
    confusion_matrix : 2x2 array-like
        Counts in the layout described above.

    Returns
    -------
    tuple
        (accuracy, auc, recall, precision, F1). Note 'auc' here is the
        arithmetic mean of recall and precision, not a true ROC AUC.
    """
    TP = confusion_matrix[0,0]
    FP = confusion_matrix[0,1]
    FN = confusion_matrix[1,0]
    TN = confusion_matrix[1,1]
    # Locals renamed so they no longer shadow the sklearn metric functions
    # imported at module level (accuracy_score, recall_score, precision_score).
    accuracy = (TP+TN)/(TN+TP+FP+FN)
    recall = TP/(TP+FN)
    precision = TP/(TP+FP)
    auc = (recall + precision)/2
    F1 = (2*precision*recall)/(precision+recall)
    return accuracy, auc, recall, precision, F1
# ---- Baseline model: logistic regression with a hyperparameter grid ----
print('Linear | Logistic Regression:')
logit = LogisticRegression(random_state=24)
# Construct the hyperparameter grid
# NOTE(review): several penalty/solver combinations in this grid are invalid
# (e.g. 'l1' with 'newton-cg'); depending on the sklearn version GridSearchCV
# may raise or score them as errors -- confirm intended behaviour.
param_grid = {'penalty':['l1', 'l2', 'elasticnet', 'none'], 'C':np.logspace(0, 4, 10),
              'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
              'l1_ratio' : [0,1,None], 'fit_intercept': [True,False]
              }
# Instantiate the GridSearchCV object using the estimator object,
# the hyperparameter grid, & 5-fold cross-validation
grid_logit = GridSearchCV(logit, param_grid=param_grid, cv=5)
grid_logit.fit(X_train, y_train)
y_pred_logit = grid_logit.predict(X_validation)
# In-sample (training) diagnostics.
cm_logit_train = confusion_matrix(y_train, grid_logit.predict(X_train), labels=None, sample_weight=None)
print('\nAccuracy Measures on Train:')
print('Confusion_matrix : \n{}'.format(cm_logit_train))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_train, grid_logit.predict(X_train))))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_train, grid_logit.predict(X_train))))
print('Classification Report:')
print(classification_report(y_train, grid_logit.predict(X_train)))
# Held-out (validation) diagnostics.
cm_logit = confusion_matrix(y_validation, y_pred_logit, labels=None, sample_weight=None)
print('\nAccuracy Measures on Test:')
print('Confusion_matrix : \n{}'.format(cm_logit))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_validation, y_pred_logit)))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_validation, y_pred_logit)))
print('Classification Report:')
print(classification_report(y_validation, y_pred_logit))
print('\n')
print('Optimal hyperparameter(s): {}.'.format(dict(grid_logit.best_params_)))
print('Optimal Estimator:\n{}'.format(grid_logit.best_estimator_))
print('Linear | Logistic Regression:')
#Ridge
# Refit a single logistic regression with the (l2 / newton-cg, no-intercept)
# settings selected by the grid search above, then score train & validation.
cls = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
                         intercept_scaling=1, l1_ratio=0, max_iter=100,
                         multi_class='auto', n_jobs=None, penalty='l2',
                         random_state=24, solver='newton-cg', tol=0.0001, verbose=0,
                         warm_start=False)
cls.fit(X_train, y_train)
y_pred_cls = cls.predict(X_validation)
cm_logit_t = confusion_matrix(y_train, cls.predict(X_train), labels=None, sample_weight=None)
cm_logit_test = confusion_matrix(y_validation, y_pred_cls, labels=None, sample_weight=None)
# --- Decision tree: small grid search over split/leaf constraints ---
print('Tree | Decision Tree Classification:')
dec_tree = DecisionTreeClassifier(random_state=24)
# Construct the hyperparameter grid
# max_depth fixed at 4 (wider ranges left commented out, presumably to keep
# the search fast — TODO confirm).
param_grid = {'criterion': ["gini"],#,"mse"],
'min_samples_split': [2, 3, 4],
'max_depth': [4],#list(range(2, 21, 2)),
'min_samples_leaf': [1, 2, 3],
#'max_leaf_nodes': [5, 20, 100],
}
# Instantiate the GridSearchCV object using the estimator object,
# the hyperparameter grid, & 5-fold cross-validation
grid_dec = GridSearchCV(dec_tree, param_grid=param_grid, cv=5)
grid_dec.fit(X_train, y_train)
y_pred_dec_tree = grid_dec.predict(X_validation)
# Matrices printed by the decision-tree report block further down.
cm_dt_train = confusion_matrix(y_train, grid_dec.predict(X_train), labels=None, sample_weight=None)
cm_dt_test = confusion_matrix(y_validation, y_pred_dec_tree, labels=None, sample_weight=None)
# --- Label-encode all object (categorical) columns of X into X_encode ---
X_encode = X.copy()
for col in X_encode.select_dtypes(include='object').columns:
    # `encode` is defined earlier in the file; it appears to transform the
    # column in place — TODO confirm its contract.
    encode(X_encode[col])
# Expression statement only useful in a notebook cell (displays the column list).
X.select_dtypes(include='object').columns
def decode(df_encoded):
    """Map label-encoded categorical columns back to their original string levels.

    Relies on module-level globals: `X` (for the object column names) and
    `list_of_cats` (one list of category labels per encoded column, defined
    earlier in the file).

    NOTE(review): the mapping zips `df[col].unique()` (order of first
    appearance) with `list_of_cats[col_idx]` — this is only correct if both
    sequences are in the same order; verify against how `encode` was built.
    """
    df = pd.DataFrame(df_encoded, columns=X.select_dtypes(include='object').columns)
    for col_idx, col in enumerate(X[['Last_Transaction_Channel', 'Number_of_Employees', 'Language']].columns):
        uniq_lst_enc = df[col].unique().tolist()
        # numeric code -> original category label
        level_mapper = {num: cat for num, cat in zip(uniq_lst_enc, list_of_cats[col_idx])}
        df[col] = df[col].map(level_mapper)
    return df
X_train_enc, X_validation_enc, y_train_enc, y_validation = train_test_split(X_encode, y, test_size = 0.50, random_state = 24)
print('Label_Encoded Training examples: {}'.format(X_train_enc.shape[0]))
print('So in X_train dataframe, there are {} independent variables and only 1 target variable.'.format(X_train_enc.shape[1],+1))
print('Also there are {} number of observations.'.format(X_train_enc.shape[0]))
print('==' * 50)
print('Label_Encoded Validation examples: {}'.format(X_validation_enc.shape[0]))
print('So in X_validation dataframe, there are {} independent variables and only 1 target variable.'.format(X_validation_enc.shape[1],+1))
print('Also there are {} number of observations.'.format(X_validation_enc.shape[0]))
# --- Random forest: fixed base estimator, small grid over max_features/oob ---
print('Ensemble | Random Forest Classification:')
rf_clf = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=7, max_features=10,
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=2000,
n_jobs=None, oob_score=True, random_state=24, verbose=0,
warm_start=False)
#rf_clf = RandomForestClassifier(random_state=24)
# Construct the hyperparameter grid
#param_grid = {#'n_estimators':[2000, 3000],
#'max_features':[5,10,15],
#'max_depth' :[3,5,7,8],
#'criterion' :['gini', 'mse', 'entropy'],
#'oob_score' :[True]
# }
# NOTE(review): max_features values 5..30 must not exceed the number of
# columns in X_train_enc, or those candidates will fail — TODO confirm width.
param_grid = {'max_features': range(5,31,5), 'oob_score' :[True, False]}
# Instantiate the GridSearchCV object using the estimator object,
# the hyperparameter grid, & 5-fold cross-validation
grid_rf = GridSearchCV(rf_clf, param_grid=param_grid, cv=5)
# Trees train on the label-encoded split, unlike the linear models above.
grid_rf.fit(X_train_enc, y_train_enc)
y_pred_rf = grid_rf.predict(X_validation_enc)
cm_rf_train = confusion_matrix(y_train_enc, grid_rf.predict(X_train_enc), labels=None, sample_weight=None)
cm_rf_test = confusion_matrix(y_validation_enc, y_pred_rf, labels=None, sample_weight=None)
# --- Gradient-boosted trees: fixed estimator, empty grid (CV-score defaults) ---
print('Ensemble | GradientBoost-Tree Classification:')
gbm = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
learning_rate=0.01, loss='deviance', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=3000,
n_iter_no_change=None, presort='deprecated',
random_state=24, subsample=1.0, tol=0.0001,
validation_fraction=0.1, verbose=0,
warm_start=False)
#xgb = XGBClassifier(random_state=24)
# Construct the hyperparameter grid
#param_grid = {'n_estimators': [500, 1000, 2000, 3000, 4000],
# 'max_depth' : [3,4,5, 6, 7, 8],
# 'learning_rate' : [0.01,0.02,0.05,0.10]
# 'obj': ['binary:logistic'] #['reg:linear','reg:squarederror']
# }
# Empty grid: GridSearchCV fits the single pre-configured candidate,
# effectively just 5-fold cross-validating the settings above.
param_grid = {
#'n_estimators': [3000],
#'max_depth' : [3,8],
#'learning_rate' : [0.01,0.02]
}
# Instantiate the GridSearchCV object using the estimator object,
# the hyperparameter grid, & 5-fold cross-validation
grid_gbm = GridSearchCV(gbm, param_grid=param_grid, cv=5)
grid_gbm.fit(X_train_enc, y_train_enc)
y_pred_gbm = grid_gbm.predict(X_validation_enc)
cm_gbm_train = confusion_matrix(y_train_enc, grid_gbm.predict(X_train_enc), labels=None, sample_weight=None)
cm_gbm_test = confusion_matrix(y_validation_enc, y_pred_gbm, labels=None, sample_weight=None)
# --- XGBoost: fixed estimator, empty grid (10-fold CV of the defaults below) ---
print('Ensemble | XGBoost Classification:\n')
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0,
learning_rate=0.01, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=3000, n_jobs=1,
nthread=None, objective='binary:logistic', random_state=24,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=None, subsample=1, verbosity=1)
# Construct the hyperparameter grid
#param_grid = {'n_estimators': [500, 1000, 2000, 3000, 4000],
# 'max_depth' : [3,4,5, 6, 7, 8],
# 'learning_rate' : [0.01,0.02,0.05,0.10]
# 'obj': ['binary:logistic'] #['reg:linear','reg:squarederror']
# }
# Empty grid: only the single pre-configured candidate is cross-validated.
param_grid = {
#'n_estimators': [3000],
#'max_depth' : [3,8],
#'learning_rate' : [0.01,0.02]
}
# Instantiate the GridSearchCV object using the estimator object,
# the hyperparameter grid, & 10-fold cross-validation
grid_xgb = GridSearchCV(xgb, param_grid=param_grid, cv=10)
grid_xgb.fit(X_train_enc, y_train_enc)
y_pred_xgb = grid_xgb.predict(X_validation_enc)
cm_xgb_train = confusion_matrix(y_train_enc, grid_xgb.predict(X_train_enc), labels=None, sample_weight=None)
cm_xgb_test = confusion_matrix(y_validation_enc, y_pred_xgb, labels=None, sample_weight=None)
def create_model(lyrs=[4], act='linear', opt='Adam', dr=0.0):
    """Build and compile a Keras binary classifier.

    Parameters
    ----------
    lyrs : list of int
        Units per hidden layer; the first layer is sized to X_train's width.
    act : str
        Activation for the hidden layers.
    opt : str
        Keras optimizer name.
    dr : float
        Dropout rate applied after each hidden layer. BUG FIX: this parameter
        was previously accepted but never used; a rate of 0.0 (the default)
        still builds exactly the original architecture.

    Returns
    -------
    A compiled keras Sequential model with a sigmoid output for binary
    cross-entropy classification.
    """
    # set random seed for reproducibility
    from numpy.random import seed
    seed(42)
    model = models.Sequential()
    # first hidden layer, sized to the training feature count
    model.add(layers.Dense(lyrs[0], input_dim=X_train.shape[1], activation=act))
    if dr > 0.0:
        model.add(layers.Dropout(dr))
    # additional hidden layers
    for i in range(1, len(lyrs)):
        model.add(layers.Dense(lyrs[i], activation=act))
        if dr > 0.0:
            model.add(layers.Dropout(dr))
    # Extra fixed relu layer kept from the original architecture.
    # NOTE(review): input_dim=8 is ignored by Keras on non-first layers, so
    # this is simply Dense(12, activation='relu') — confirm it is intentional.
    model.add(layers.Dense(12, input_dim=8, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))  # output layer
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model
# --- Neural network: summary of default architecture, then batch/epoch search ---
model = create_model()
print(model.summary())
# Wrap the builder so sklearn's GridSearchCV can drive Keras training.
model = KerasClassifier(build_fn=create_model, verbose=0)
# Construct the hyperparameter grid
batch_size = [16, 32]
epochs = [50]
#optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
#activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
param_grid = dict(batch_size=batch_size, epochs=epochs) #optimizer=optimizer, , activation=activation
# Instantiate the GridSearchCV object using the estimator object,
# the hyperparameter grid, & 5-fold cross-validation
grid_nn = GridSearchCV(estimator=model,param_grid=param_grid,n_jobs=-1,cv=5,verbose=2)# include n_jobs=-1 if you are using CPU
grid_nn.fit(X_train, y_train)
y_pred_neu = grid_nn.predict(X_validation)
# --- ROC comparison of all six models on their respective validation splits ---
print('Receiver Operating Characteristics:\n')
# AUC labels computed from hard class predictions; the curves below use
# predicted probabilities, so the label AUC can differ from the curve's area.
logit_roc_auc = roc_auc_score(y_validation, y_pred_cls)
dectree_roc_auc = roc_auc_score(y_validation, y_pred_dec_tree)
rand_roc_auc = roc_auc_score(y_validation_enc, y_pred_rf)
grad_roc_auc = roc_auc_score(y_validation_enc, y_pred_gbm)
neural_roc_auc = roc_auc_score(y_validation, y_pred_neu)
xgb_roc_auc = roc_auc_score(y_validation_enc, y_pred_xgb)
# One (fpr, tpr) pair per model; tree ensembles use the label-encoded split.
fpr1, tpr1, thresholds1 = roc_curve(y_validation, cls.predict_proba(X_validation)[:,1])
fpr2, tpr2, thresholds2 = roc_curve(y_validation, grid_dec.predict_proba(X_validation)[:,1])
fpr3, tpr3, thresholds3 = roc_curve(y_validation_enc, grid_rf.predict_proba(X_validation_enc)[:,1])
fpr4, tpr4, thresholds4 = roc_curve(y_validation_enc, grid_gbm.predict_proba(X_validation_enc)[:,1])
fpr5, tpr5, thresholds5 = roc_curve(y_validation, grid_nn.predict_proba(X_validation)[:,1])
fpr6, tpr6, thresholds6 = roc_curve(y_validation_enc, grid_xgb.predict_proba(X_validation_enc)[:,1])
plt.figure(figsize=(8,5))
plt.plot(fpr1, tpr1, label='Logistic Regression(%0.3f)' % logit_roc_auc)
plt.plot(fpr2, tpr2, label='Decision Tree(%0.3f)' % dectree_roc_auc)
plt.plot(fpr3, tpr3, label='Random Forest(%0.3f)' % rand_roc_auc)
plt.plot(fpr4, tpr4, label='GradientBoosted Tree(%0.3f)' % grad_roc_auc)
plt.plot(fpr5, tpr5, label='Neural Network(%0.3f)' % neural_roc_auc)
plt.plot(fpr6, tpr6, label='XGBoost(%0.3f)' % xgb_roc_auc)
# Chance diagonal for reference.
plt.plot([0, 1], [0, 1],'r--',label='Baseline(0.50)')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.suptitle('Receiver Operating Characteristics [ROC]')
plt.title('AUC Scoreboard')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# --- Report: hand-tuned logistic model (cls) on train and validation ---
print('Linear | Logistic Regression:')
print('\nAccuracy Measures on Train:')
print('Confusion_matrix : \n{}'.format(cm_logit_t))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_train, cls.predict(X_train))))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_train, cls.predict(X_train))))
# log_loss on hard 0/1 predictions penalises each miss maximally; scores on
# probabilities would be more informative.
print('Log Loss : {:5.3f}'.format(log_loss(y_train, cls.predict(X_train))))
print('Classification Report:')
print(classification_report(y_train, cls.predict(X_train)))
print('\nAccuracy Measures on Test:')
print('Confusion_matrix : \n{}'.format(cm_logit_test))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_validation, y_pred_cls)))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_validation, y_pred_cls)))
print('Log Loss : {:5.3f}'.format(log_loss(y_validation, y_pred_cls)))
print('Classification Report:')
print(classification_report(y_validation, y_pred_cls))
# --- Report: tuned decision tree on train and validation ---
print('Tree | Decision Tree Classification:')
print('\nAccuracy Measures on Train:')
print('Confusion_matrix : \n{}'.format(cm_dt_train))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_train, grid_dec.predict(X_train))))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_train, grid_dec.predict(X_train))))
print('Log Loss : {:5.3f}'.format(log_loss(y_train, grid_dec.predict(X_train))))
print('Classification Report:')
print(classification_report(y_train, grid_dec.predict(X_train)))
print('\nAccuracy Measures on Test:')
print('Confusion_matrix : \n{}'.format(cm_dt_test))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_validation, y_pred_dec_tree)))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_validation, y_pred_dec_tree)))
print('Log Loss : {:5.3f}'.format(log_loss(y_validation, y_pred_dec_tree)))
print('Classification Report:')
print(classification_report(y_validation, y_pred_dec_tree))
print('\n')
print('Optimal hyperparameter(s): {}.'.format(dict(grid_dec.best_params_)))
print('Optimal Estimator:\n{}'.format(grid_dec.best_estimator_))
# --- Report: tuned random forest (label-encoded split) ---
print('Ensemble | Random Forest Classification:')
print('\nAccuracy Measures on Train:')
print('Confusion_matrix : \n{}'.format(cm_rf_train))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_train_enc, grid_rf.predict(X_train_enc))))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_train_enc, grid_rf.predict(X_train_enc))))
print('Log Loss : {:5.3f}'.format(log_loss(y_train_enc, grid_rf.predict(X_train_enc))))
print('Classification Report:')
print(classification_report(y_train_enc, grid_rf.predict(X_train_enc)))
print('\nAccuracy Measures on Test:')
print('Confusion_matrix : \n{}'.format(cm_rf_test))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_validation_enc, y_pred_rf)))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_validation_enc, y_pred_rf)))
print('Log Loss : {:5.3f}'.format(log_loss(y_validation_enc, y_pred_rf)))
print('Classification Report:')
print(classification_report(y_validation_enc, y_pred_rf))
print('\n')
print('Optimal hyperparameter(s): {}.'.format(dict(grid_rf.best_params_)))
print('Optimal Estimator:\n{}'.format(grid_rf.best_estimator_))
# --- Report: gradient-boosted trees (label-encoded split) ---
print('Ensemble | GradientBoost-Tree Classification:')
print('\nAccuracy Measures on Train:')
print('Confusion_matrix : \n{}'.format(cm_gbm_train))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_train_enc, grid_gbm.predict(X_train_enc))))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_train_enc, grid_gbm.predict(X_train_enc))))
print('Log Loss : {:5.3f}'.format(log_loss(y_train_enc, grid_gbm.predict(X_train_enc))))
print('Classification Report:')
print(classification_report(y_train_enc, grid_gbm.predict(X_train_enc)))
print('\nAccuracy Measures on Test:')
print('Confusion_matrix : \n{}'.format(cm_gbm_test))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_validation_enc, y_pred_gbm)))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_validation_enc, y_pred_gbm)))
print('Log Loss : {:5.3f}'.format(log_loss(y_validation_enc, y_pred_gbm)))
print('Classification Report:')
print(classification_report(y_validation_enc, y_pred_gbm))
print('\n')
print('Optimal hyperparameter(s): {}.'.format(dict(grid_gbm.best_params_)))
print('Optimal Estimator:\n{}'.format(grid_gbm.best_estimator_))
# --- Report: XGBoost (label-encoded split) ---
print('Ensemble | XGBoost Classification:')
print('\nAccuracy Measures on Train:')
print('Confusion_matrix : \n{}'.format(cm_xgb_train))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_train_enc, grid_xgb.predict(X_train_enc))))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_train_enc, grid_xgb.predict(X_train_enc))))
print('Log Loss : {:5.3f}'.format(log_loss(y_train_enc, grid_xgb.predict(X_train_enc))))
print('Classification Report:')
print(classification_report(y_train_enc, grid_xgb.predict(X_train_enc)))
print('\nAccuracy Measures on Test:')
print('Confusion_matrix : \n{}'.format(cm_xgb_test))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_validation_enc, y_pred_xgb)))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_validation_enc, y_pred_xgb)))
print('Log Loss : {:5.3f}'.format(log_loss(y_validation_enc, y_pred_xgb)))
print('Classification Report:')
print(classification_report(y_validation_enc, y_pred_xgb))
print('\n')
print('Optimal hyperparameter(s): {}.'.format(dict(grid_xgb.best_params_)))
print('Optimal Estimator:\n{}'.format(grid_xgb.best_estimator_))
# --- Report: tuned Keras neural network on train and validation ---
print('NeuralNet | Neural Network:')
cm_nn_train = confusion_matrix(y_train, grid_nn.predict(X_train), labels=None, sample_weight=None)
print('\nAccuracy Measures on Train:')
print('Confusion_matrix : \n{}'.format(cm_nn_train))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_train, grid_nn.predict(X_train))))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_train, grid_nn.predict(X_train))))
print('Log Loss : {:5.3f}'.format(log_loss(y_train, grid_nn.predict(X_train))))
print('Classification Report:')
print(classification_report(y_train, grid_nn.predict(X_train)))
cm_nn_test = confusion_matrix(y_validation, y_pred_neu, labels=None, sample_weight=None)
print('\nAccuracy Measures on Test:')
# BUG FIX: this previously printed cm_gbm_test (the GradientBoost matrix);
# the neural-network test matrix computed just above is the right one.
print('Confusion_matrix : \n{}'.format(cm_nn_test))
print('Accuracy Score : {:5.3f}'.format(accuracy_score(y_validation, y_pred_neu)))
print('AUC Score : {:5.3f}'.format(roc_auc_score(y_validation, y_pred_neu)))
print('Log Loss : {:5.3f}'.format(log_loss(y_validation, y_pred_neu)))
print('Classification Report:')
print(classification_report(y_validation, y_pred_neu))
print('\n')
print('Optimal hyperparameter(s): {}.'.format(dict(grid_nn.best_params_)))
print('Optimal Estimator:\n{}'.format(grid_nn.best_estimator_))
# Univariate feature selection: chi-square scores for each feature vs target.
# The three PCA columns are excluded (chi2 requires non-negative inputs;
# PCA components can be negative — presumably why they are dropped).
bestfeatures = SelectKBest(score_func=chi2, k=10)
X_chi = X_train.drop(['PCA_1_Prt_Mon_Com_Chr', 'PCA_1_DM_Tele', 'PCA_2_Email'],axis=1)
fit = bestfeatures.fit(X_chi,y_train)
dfscores = pd.DataFrame(fit.scores_)
# BUG FIX: column names must come from the frame that was actually scored
# (X_chi), not from the full X — otherwise names and scores misalign after
# the three dropped columns.
dfcolumns = pd.DataFrame(X_chi.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Features','Score'] #naming the dataframe columns
print(featureScores.nlargest(15,'Score')) #print 15 best features
# Fit a stand-alone random forest (same settings as the tuned classifier)
# to extract impurity-based feature importances.
RF = RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                            criterion='gini', max_depth=7, max_features=10,
                            max_leaf_nodes=None, max_samples=None,
                            min_impurity_decrease=0.0, min_impurity_split=None,
                            min_samples_leaf=1, min_samples_split=2,
                            min_weight_fraction_leaf=0.0, n_estimators=2000,
                            n_jobs=None, oob_score=True, random_state=24, verbose=0,
                            warm_start=False)
RF.fit(X_train_enc,y_train_enc)
RF_dffeatures = pd.DataFrame(RF.feature_importances_)
RF_dfcolumns = pd.DataFrame(X_train_enc.columns)
RF_featureScores = pd.concat([RF_dfcolumns, RF_dffeatures], axis=1)
RF_featureScores.columns = ['Features','Importance'] # naming the dataframe columns
print(RF_featureScores.nlargest(15,'Importance')) # provide top features
# Horizontal bar chart, smallest importance at the bottom.
plot_RF = RF_featureScores.nlargest(15,'Importance').reset_index().drop('index',axis=1).sort_values('Importance', ascending=True)
plt.barh(plot_RF['Features'],plot_RF['Importance'])
# BUG FIX: title said "Top 10" while nlargest(15) plots 15 features.
plt.title("Top 15 - Features Importance by RandomForest")
plt.ylabel("Features")
plt.xlabel("Importance Scale")
plt.show()
# Fit a stand-alone gradient-boosted model (same settings as the tuned one)
# to extract impurity-based feature importances.
g = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                               learning_rate=0.01, loss='deviance', max_depth=3,
                               max_features=None, max_leaf_nodes=None,
                               min_impurity_decrease=0.0, min_impurity_split=None,
                               min_samples_leaf=1, min_samples_split=2,
                               min_weight_fraction_leaf=0.0, n_estimators=3000,
                               n_iter_no_change=None, presort='deprecated',
                               random_state=24, subsample=1.0, tol=0.0001,
                               validation_fraction=0.1, verbose=0,
                               warm_start=False)
g.fit(X_train_enc,y_train_enc)
g_dffeatures = pd.DataFrame(g.feature_importances_)
g_dfcolumns = pd.DataFrame(X_train_enc.columns)
g_featureScores = pd.concat([g_dfcolumns, g_dffeatures], axis=1)
g_featureScores.columns = ['Features','Importance'] # naming the dataframe columns
print(g_featureScores.nlargest(15,'Importance')) # provide top features
plot_g = g_featureScores.nlargest(15,'Importance').reset_index().drop('index',axis=1).sort_values('Importance', ascending=True)
plt.barh(plot_g['Features'],plot_g['Importance'])
# BUG FIX: title said "Top 10" while nlargest(15) plots 15 features.
plt.title("Top 15 - Features Importance by GradientBoosted Tree")
plt.ylabel("Features")
plt.xlabel("Importance Scale")
plt.show()
# Fit a stand-alone XGBoost model (same settings as the tuned one) to extract
# its default feature importances; `xgc` is reused by the SHAP analysis below.
xgc = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bynode=1, colsample_bytree=1, gamma=0,
                    learning_rate=0.01, max_delta_step=0, max_depth=3,
                    min_child_weight=1, missing=None, n_estimators=3000, n_jobs=1,
                    nthread=None, objective='binary:logistic', random_state=24,
                    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                    silent=None, subsample=1, verbosity=1)
xgc.fit(X_train_enc,y_train_enc)
xgc_dffeatures = pd.DataFrame(xgc.feature_importances_)
xgc_dfcolumns = pd.DataFrame(X_train_enc.columns)
xgc_featureScores = pd.concat([xgc_dfcolumns, xgc_dffeatures], axis=1)
xgc_featureScores.columns = ['Features','Importance'] # naming the dataframe columns
print(xgc_featureScores.nlargest(15,'Importance')) # provide top features
plot_xgc = xgc_featureScores.nlargest(15,'Importance').reset_index().drop('index',axis=1).sort_values('Importance', ascending=True)
plt.barh(plot_xgc['Features'],plot_xgc['Importance'])
# BUG FIX: title said "Top 10" while nlargest(15) plots 15 features.
plt.title("Top 15 - Features Importance by XGBoost")
plt.ylabel("Features")
plt.xlabel("Importance Scale")
plt.show()
# --- Logistic regression coefficients, bar-plotted in descending order ---
print('Logistic Regression:\n')
# cls.coef_[0]: one coefficient per feature for the binary model.
coefs = pd.Series(cls.coef_[0],index=X_train.columns)
coefs = coefs.sort_values(ascending=False)
plt.figure(figsize=(15,15))
plt.subplot(1,1,1)
coefs.plot(kind='bar')
plt.show()
# Already sorted above; the second sort is a no-op repeat.
print(coefs.sort_values(ascending=False))
# --- XGBoost's three built-in importance views (weight / gain / cover) ---
# NOTE(review): this rebinds the name `xgb`, which earlier held an
# XGBClassifier instance; from here on `xgb` is the xgboost module.
import xgboost as xgb
fig = plt.figure(figsize = (16, 12))
title = fig.suptitle("Default Feature Importances from XGBoost", fontsize=14)
ax1 = fig.add_subplot(2,2,1)
xgb.plot_importance(xgc, importance_type='weight', ax=ax1)
t=ax1.set_title("Feature Importance - Feature Weight")
ax2 = fig.add_subplot(2,2,2)
xgb.plot_importance(xgc, importance_type='gain', ax=ax2)
t=ax2.set_title("Feature Importance - Split Mean Gain")
ax3 = fig.add_subplot(2,2,3)
xgb.plot_importance(xgc, importance_type='cover', ax=ax3)
t=ax3.set_title("Feature Importance - Sample Coverage")
# --- SHAP analysis of the fitted XGBoost model ---
# NOTE(review): X_validation_disp is only defined further DOWN this file
# (near the decode() call) — these notebook cells were exported out of
# execution order; running this file top-to-bottom would raise NameError here.
import shap
explainer = shap.TreeExplainer(xgc)
# 'Languager' matches the (apparently misspelled) display column name created
# when X_validation_disp is built — keep them consistent.
shap_values = explainer.shap_values(X_validation_disp.drop(['LastTransactionChannel','NumberofEmployees','Languager'],axis=1))
print('Expected Value:', explainer.expected_value)
pd.DataFrame(shap_values).head()
shap.initjs()
# Force plots for two individual observations, then the first 7000 rows.
shap.force_plot(explainer.expected_value,shap_values[0,:], X_validation_disp.drop(['LastTransactionChannel','NumberofEmployees','Languager'],axis=1).iloc[0,:])
shap.force_plot(explainer.expected_value,shap_values[2,:], X_validation_disp.drop(['LastTransactionChannel','NumberofEmployees','Languager'],axis=1).iloc[2,:])
shap.force_plot(explainer.expected_value, shap_values[:7000,:], X_validation_enc.iloc[:7000,:])
shap.summary_plot(shap_values, X_validation_enc, plot_type="bar")
def ABS_SHAP(df_shap, df):
    """Bar-plot mean |SHAP| per feature, colored by the sign of each feature's
    correlation with its own SHAP values (red = positive impact, blue = negative).

    df_shap : array of SHAP values, one row per observation in `df`.
    df      : feature dataframe aligned with `df_shap`.
    """
    # Align SHAP values with the feature names.
    shap_frame = pd.DataFrame(df_shap)
    feature_names = df.columns
    shap_frame.columns = feature_names
    features = df.copy().reset_index().drop('index', axis=1)
    # Correlation between each feature's raw values and its SHAP values
    # decides the bar color; NaN correlations (constant columns) become 0.
    corr_values = [np.corrcoef(shap_frame[name], features[name])[1][0]
                   for name in feature_names]
    corr_df = pd.concat([pd.Series(feature_names), pd.Series(corr_values)], axis=1).fillna(0)
    corr_df.columns = ['Variable', 'Corr']
    corr_df['Sign'] = np.where(corr_df['Corr'] > 0, 'red', 'blue')
    # Mean absolute SHAP value per feature = global importance.
    magnitude = pd.DataFrame(np.abs(shap_frame).mean()).reset_index()
    magnitude.columns = ['Variable', 'SHAP_abs']
    merged = magnitude.merge(corr_df, left_on='Variable', right_on='Variable', how='inner')
    merged = merged.sort_values(by='SHAP_abs', ascending=True)
    ax = merged.plot.barh(x='Variable', y='SHAP_abs', color=merged['Sign'],
                          figsize=(5, 6), legend=False)
    ax.set_xlabel("SHAP Value (Red = Positive Impact)")
# --- Global SHAP summaries, then per-feature dependence plots ---
ABS_SHAP(shap_values,X_validation_enc)
shap.summary_plot(shap_values, X_validation_enc)
# interaction_index == ind plots each feature against itself (no interaction
# coloring); display_features swaps in decoded labels where available.
shap.dependence_plot(ind='TenureYrs', interaction_index='TenureYrs',
shap_values=shap_values,
features=X_validation_enc,
display_features=X_validation_disp)
shap.dependence_plot(ind='Number_of_Prior_Year_Transactions', interaction_index='Number_of_Prior_Year_Transactions',
shap_values=shap_values,
features=X_validation_enc,
display_features=X_validation_disp)
shap.dependence_plot(ind='Historical_Sales_Volume', interaction_index='Historical_Sales_Volume',
shap_values=shap_values,
features=X_validation_enc,
display_features=X_validation_disp)
shap.dependence_plot(ind='PCA_1_Prt_Mon_Com_Chr', interaction_index='PCA_1_Prt_Mon_Com_Chr',
shap_values=shap_values,
features=X_validation_enc,
display_features=X_validation_enc)
shap.dependence_plot(ind='PCA_1_DM_Tele', interaction_index='PCA_1_DM_Tele',
shap_values=shap_values,
features=X_validation_enc,
display_features=X_validation_enc)
# --- Build X_validation_disp: encoded columns side-by-side with decoded labels ---
a = X_validation_enc
b = decode(X_validation_enc)
# Strip underscores so decoded column names don't clash with the encoded ones.
b.rename(columns=lambda col: col.replace('_', ''), inplace=True)
# NOTE(review): 'Languager' looks like a typo for 'Language', but it is used
# consistently by every later reference — renaming it here would break them.
b.rename(columns={'Language': 'Languager'}, inplace=True)
b.columns
X_validation_disp = pd.concat([a, b],axis = 1)
X_validation_disp.columns
# --- Dependence plots for categorical features + code→label cross-tabs ---
shap.dependence_plot(ind='Last_Transaction_Channel', interaction_index='Last_Transaction_Channel',
shap_values=shap_values,
features=X_validation_enc,
display_features=X_validation_enc)
# groupby of encoded code vs decoded label documents the code mapping.
print(X_validation_disp.groupby(['Last_Transaction_Channel','LastTransactionChannel']).size())
shap.dependence_plot(ind='Number_of_Employees', interaction_index='Number_of_Employees',
shap_values=shap_values,
features=X_validation_enc,
display_features=X_validation_enc)
print(X_validation_disp.groupby(['Number_of_Employees','NumberofEmployees']).size())
shap.dependence_plot(ind='Language', interaction_index='Language',
shap_values=shap_values,
features=X_validation_enc,
display_features=X_validation_enc)
print(X_validation_disp.groupby(['Language','Languager']).size())
# Cross-feature interaction plots (feature vs a different interaction feature).
shap.dependence_plot(ind='Historical_Sales_Volume', interaction_index='Number_of_Prior_Year_Transactions',
shap_values=shap_values, features=X_validation_enc,
display_features=X_validation_enc)
shap.dependence_plot(ind='TenureYrs', interaction_index='Historical_Sales_Volume',
shap_values=shap_values, features=X_validation_enc,
display_features=X_validation_enc)
shap.dependence_plot(ind='Number_of_Prior_Year_Transactions', interaction_index='Historical_Sales_Volume',
shap_values=shap_values, features=X_validation_enc,
display_features=X_validation_enc)
# --- Persist the modeling dataframe to CSV ---
file_path = '../PGDADS_Capstone Assignment/model_df.csv'
model_df.to_csv(file_path)
print('Data exported successfully!')
model_df.info(verbose=False)
# --- Score every customer with the XGBoost model and attach the probability ---
xgc.predict_proba(X_encode)[:,1]
# Column 1 = probability of the positive class.
predict_prob = pd.DataFrame(data=xgc.predict_proba(X_encode)[:,1],columns=['Est_Prob'])
predict_prob.tail(3)
# reset_index aligns model_df positionally with predict_prob's 0..n-1 index.
proba_df = pd.concat([model_df.reset_index(), predict_prob],axis = 1)
proba_df.tail(10)
proba_df.info()
file_path = '../PGDADS_Capstone Assignment/propensitymodel.csv'
proba_df.to_csv(file_path)
print('Data exported successfully!')
# NOTE(review): 'level_0' only exists if model_df had a prior reset_index /
# named index level — confirm upstream; otherwise only 'index' is renamed.
proba_df.rename(columns={'level_0': 'Aft_Propensity_Index','index': 'Original_Index'}, inplace=True)
# --- Build the regression dataset: log-sales target for responders only ---
regress_df = proba_df.copy()
# NOTE(review): log(0) = -inf for customers with zero campaign sales; the
# later filter to Response==1 presumably removes those rows — confirm.
regress_df['log_act_y'] = np.log(regress_df['Campaign_Period_Sales'])
#regress_df['log_HistSales'] = np.log2(regress_df['Historical_Sales_Volume'])
#regress_df['inv_PYTran'] = 1/np.exp(regress_df['Number_of_Prior_Year_Transactions'])
# Average sale per transaction; division by zero yields inf for customers
# with no prior-year transactions — TODO confirm none exist among responders.
regress_df['Avg_Sales_PU'] = regress_df['Historical_Sales_Volume']/regress_df['Number_of_Prior_Year_Transactions']
regress_df.info()
# Columns excluded from the regression feature set (IDs, leakage, targets).
list_to_remove2 = ['log_act_y','Aft_Propensity_Index','Customer_Number','Contact_Channel','Repurchase_Method','Original_Index','Do_Not_Direct_Mail_Solicit','Do_Not_Email','Do_Not_Telemarket','Response','Date_of_First_Purchase','Campaign_Period_Sales','Standard_Chair','Monitor','Printer','Computer','ProductMix']
print('Model Dataset if using Option 1:')
print(regress_df.drop(list_to_remove2, axis=1).info())
# Restrict to responders: sales regression is conditional on responding.
regress_df_resp = regress_df[regress_df['Response']==1]
regress_df_resp.info()
X_model_df = regress_df_resp.drop(list_to_remove2,axis=1)
y_model_df = regress_df_resp['log_act_y']
corr_reg= regress_df_resp.select_dtypes(exclude='object').corr()
plt.figure(figsize=(18,15))
sns.heatmap(corr_reg, annot=True, cmap='coolwarm')
sns.despine()
# Apply train, test split
print(X_model_df.shape, y_model_df.shape)
print('So in X dataframe, there are {} independent variables and in Y dataframe,\nonly 1 target variable.'.format(X_model_df.shape[1], 1))
print('Also there are {} number of observations.'.format(X_model_df.shape[0]))
# Two encodings of the categorical columns: label-encoded (for trees) and
# one-hot (for linear models).
X_model_df_enc = X_model_df.copy()
for col in X_model_df_enc.select_dtypes(include='object').columns:
    # `encode` is defined earlier in the file; appears to transform in place.
    encode(X_model_df_enc[col])
X_model_df_enc.info()
X_model_df_OHE = pd.get_dummies(X_model_df)
X_model_df_OHE.info()
# 50/50 split of the label-encoded regression features (tree models).
X_train_enc2, X_validation_enc2, y_train_enc2, y_validation_enc2 = train_test_split(X_model_df_enc,y_model_df,test_size = 0.50,random_state = 42)
print('Ordinal Encoder Training examples: {}'.format(X_train_enc2.shape[0]))
print('So in X_train dataframe, there are {} independent variables and only 1 target variable.'.format(X_train_enc2.shape[1],+1))
print('Also there are {} number of observations.'.format(X_train_enc2.shape[0]))
print('==' * 50)
print('Ordinal Encoder Validation examples: {}'.format(X_validation_enc2.shape[0]))
print('So in X_validation dataframe, there are {} independent variables and only 1 target variable.'.format(X_validation_enc2.shape[1],+1))
print('Also there are {} number of observations.'.format(X_validation_enc2.shape[0]))
# 50/50 split of the one-hot-encoded regression features (linear models).
X_train_ohe, X_validation_ohe, y_train_ohe, y_validation_ohe = train_test_split(X_model_df_OHE,y_model_df,test_size = 0.50,random_state = 42)
# BUG FIX: these four labels previously said "Ordinal Encoder" — a copy-paste
# from the block above — but this split uses the one-hot-encoded frame.
print('One-Hot Encoded Training examples: {}'.format(X_train_ohe.shape[0]))
print('So in X_train dataframe, there are {} independent variables and only 1 target variable.'.format(X_train_ohe.shape[1],+1))
print('Also there are {} number of observations.'.format(X_train_ohe.shape[0]))
print('==' * 50)
print('One-Hot Encoded Validation examples: {}'.format(X_validation_ohe.shape[0]))
print('So in X_validation dataframe, there are {} independent variables and only 1 target variable.'.format(X_validation_ohe.shape[1],+1))
print('Also there are {} number of observations.'.format(X_validation_ohe.shape[0]))
# --- Linear regression on the one-hot features: tiny grid over fit options ---
linear_mod = LinearRegression()
# Construct the hyperparameter grid
# NOTE(review): 'normalize' is deprecated/removed in newer sklearn; this
# grid assumes the older API the rest of the file targets.
param_grid = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
# Instantiate the GridSearchCV object using the estimator object,
# the hyperparameter grid, & 5-fold cross-validation
grid_linear = GridSearchCV(linear_mod, param_grid=param_grid, cv=5)
grid_linear.fit(X_train_ohe, y_train_ohe)
y_pred_linear = grid_linear.predict(X_validation_ohe)
# Target is log(sales), so MSE/RMSE are in log units.
print('R2 Score :', round(r2_score(y_validation_ohe, y_pred_linear),4))
print('MSE :', round(metrics.mean_squared_error(y_validation_ohe, y_pred_linear),4))
print('RMSE :', round(np.sqrt(metrics.mean_squared_error(y_validation_ohe, y_pred_linear)),4))
print('Optimal hyperparameter(s): {}.'.format(dict(grid_linear.best_params_)))
print('Optimal Estimator:\n{}'.format(grid_linear.best_estimator_))
# --- Ridge regression: grid over alpha and fit options ---
rr = Ridge(random_state=24)
# Construct the hyperparameter grid
param_grid = {'alpha': [0.0001, 0.001, 0.01, 0.02, 0.1, 1, 10, 100, 1000],
'fit_intercept':[True,False],'normalize':[True,False], 'copy_X':[True, False],
'max_iter' :[100,1000,10e3,10e5]}
#higher the alpha value, more restriction on the coefficients;
#low alpha > more generalization, coefficients are barely
#Instantiate the GridSearchCV object using the estimator object,
# the hyperparameter grid, & 5-fold cross-validation
grid_ridge = GridSearchCV(rr, param_grid=param_grid, cv=5)
grid_ridge.fit(X_train_ohe, y_train_ohe)
y_pred_rr = grid_ridge.predict(X_validation_ohe)
print('R2 Score :', round(r2_score(y_validation_ohe, y_pred_rr),4))
print('MSE :', round(metrics.mean_squared_error(y_validation_ohe, y_pred_rr),4))
print('RMSE :', round(np.sqrt(metrics.mean_squared_error(y_validation_ohe, y_pred_rr)),4))
print('Optimal hyperparameter(s): {}.'.format(dict(grid_ridge.best_params_)))
print('Optimal Estimator:\n{}'.format(grid_ridge.best_estimator_))
# --- Elastic net: hard-coded settings (previous winner, presumably), empty grid ---
elastic = ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True,
l1_ratio=0.30000000000000004, max_iter=10, normalize=False,
positive=False, precompute=False, random_state=24,
selection='cyclic', tol=0.0001, warm_start=False)
# Construct the hyperparameter grid
# Empty grid: only the pre-configured candidate is cross-validated.
param_grid = {#'alpha': [0.0001, 0.001, 0.01, 0.02, 0.1, 1, 10, 100, 1000],
# 'l1_ratio': np.arange(0.0, 1.0, 0.1),
# 'max_iter' :[10, 100, 1000]
}
#Instantiate the GridSearchCV object using the estimator object,
# the hyperparameter grid, & 5-fold cross-validation
grid_elastic= GridSearchCV(elastic, param_grid=param_grid, cv=5)
grid_elastic.fit(X_train_ohe, y_train_ohe)
y_pred_elastic = grid_elastic.predict(X_validation_ohe)
print('R2 Score :', round(r2_score(y_validation_ohe, y_pred_elastic),4))
print('MSE :', round(metrics.mean_squared_error(y_validation_ohe, y_pred_elastic),4))
print('RMSE :', round(np.sqrt(metrics.mean_squared_error(y_validation_ohe, y_pred_elastic)),4))
print('Optimal hyperparameter(s): {}.'.format(dict(grid_elastic.best_params_)))
print('Optimal Estimator:\n{}'.format(grid_elastic.best_estimator_))
# Random-forest regressor; random_state fixed for reproducibility.
rf = RandomForestRegressor(random_state=24)
# Construct the hyperparameter grid.
# BUG FIX: 'entropy' is a *classification* criterion -- RandomForestRegressor
# only accepts regression criteria ('mse'/'mae' in this sklearn version), so
# every grid candidate with criterion='entropy' made GridSearchCV.fit raise.
param_grid = {'n_estimators': [1000, 3000],
              'max_features': ['auto', 'sqrt', 'log2'],
              'max_depth': [3, 5, 7],
              'criterion': ['mse'],
              'oob_score': [True, False]
              }
# Instantiate the GridSearchCV object using the estimator object,
# the hyperparameter grid, & 5-fold cross-validation
grid_rand = GridSearchCV(rf, param_grid=param_grid, cv=5)
grid_rand.fit(X_train_enc2, y_train_enc2)
y_pred_rand = grid_rand.predict(X_validation_enc2)
print('R2 Score :', round(r2_score(y_validation_enc2, y_pred_rand),4))
print('MSE :', round(metrics.mean_squared_error(y_validation_enc2, y_pred_rand),4))
print('RMSE :', round(np.sqrt(metrics.mean_squared_error(y_validation_enc2, y_pred_rand)),4))
print('Optimal hyperparameter(s): {}.'.format(dict(grid_rand.best_params_)))
print('Optimal Estimator:\n{}'.format(grid_rand.best_estimator_))
# Gradient-boosted regression trees with hyperparameters fixed from an
# earlier search (1000 depth-5 trees, slow 0.01 learning rate, least-squares
# loss). Dropped kwargs that merely restated this sklearn version's defaults
# and have since been removed from the API (min_impurity_split,
# presort='deprecated') so the cell survives an sklearn upgrade.
# NOTE(review): loss='ls' was later renamed 'squared_error'; update on upgrade.
gbm = GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                                init=None, learning_rate=0.01, loss='ls', max_depth=5,
                                max_features=None, max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                min_samples_leaf=1, min_samples_split=2,
                                min_weight_fraction_leaf=0.0, n_estimators=1000,
                                n_iter_no_change=None,
                                random_state=24, subsample=1.0, tol=0.0001,
                                validation_fraction=0.1, verbose=0, warm_start=False)
# Empty grid: hyperparameters were tuned previously; the 5-fold CV below just
# re-fits and scores this single candidate. (Dead commented-out entries removed.)
param_grid = {}
# Instantiate the GridSearchCV object using the estimator object,
# the hyperparameter grid, & 5-fold cross-validation
grid_gbmr = GridSearchCV(gbm, param_grid=param_grid, cv=5)
grid_gbmr.fit(X_train_enc2, y_train_enc2)
y_pred_gbmr = grid_gbmr.predict(X_validation_enc2)
print('R2 Score :', round(r2_score(y_validation_enc2, y_pred_gbmr),4))
print('MSE :', round(metrics.mean_squared_error(y_validation_enc2, y_pred_gbmr),4))
print('RMSE :', round(np.sqrt(metrics.mean_squared_error(y_validation_enc2, y_pred_gbmr)),4))
print('Optimal hyperparameter(s): {}.'.format(dict(grid_gbmr.best_params_)))
print('Optimal Estimator:\n{}'.format(grid_gbmr.best_estimator_))
# XGBoost regressor with hyperparameters fixed from an earlier search.
# FIX: objective 'reg:linear' is the deprecated alias of 'reg:squarederror'
# (identical squared-error objective); newer XGBoost releases reject the
# old name, so use the canonical one.
xgbr = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                        colsample_bynode=1, colsample_bytree=1, gamma=0,
                        importance_type='gain', learning_rate=0.02, max_delta_step=0,
                        max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
                        n_jobs=1, nthread=None, objective='reg:squarederror', random_state=24,
                        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                        silent=None, subsample=1, verbosity=1)
xgbr.fit(X_train_enc2, y_train_enc2)
y_pred_gbm_regr = xgbr.predict(X_validation_enc2)
# Validation-set fit metrics for the untuned (pre-fixed) XGBoost model.
print('R2 Score :', round(r2_score(y_validation_enc2, y_pred_gbm_regr),4))
print('MSE :', round(metrics.mean_squared_error(y_validation_enc2, y_pred_gbm_regr),4))
print('RMSE :', round(np.sqrt(metrics.mean_squared_error(y_validation_enc2, y_pred_gbm_regr)),4))
# Pair each training column with its importance score and show the top 10.
xgbr_dffeatures = pd.DataFrame(xgbr.feature_importances_)
xgbr_dfcolumns = pd.DataFrame(X_train_enc2.columns)
xgbr_featureScores = pd.concat([xgbr_dfcolumns, xgbr_dffeatures], axis=1)
xgbr_featureScores.columns = ['Features','Importance'] # naming the dataframe columns
print(xgbr_featureScores.nlargest(10,'Importance')) # provide top features
# Visualise the three built-in XGBoost importance metrics side by side on a
# 2x2 grid (fourth cell left empty).
fig = plt.figure(figsize=(16, 12))
title = fig.suptitle("Default Feature Importances from XGBoost", fontsize=14)
importance_panels = [('weight', "Feature Importance - Feature Weight"),
                     ('gain', "Feature Importance - Split Mean Gain"),
                     ('cover', "Feature Importance - Sample Coverage")]
for panel_idx, (imp_type, panel_title) in enumerate(importance_panels, start=1):
    ax = fig.add_subplot(2, 2, panel_idx)
    xgb.plot_importance(xgbr, importance_type=imp_type, ax=ax)
    t = ax.set_title(panel_title)
# SHAP (SHapley Additive exPlanations) breakdown of the XGBoost model on the
# validation set. Local import kept, matching the original notebook cell.
import shap
explainer = shap.TreeExplainer(xgbr)
shap_values = explainer.shap_values(X_validation_enc2)
print('Expected Value:', explainer.expected_value)
pd.DataFrame(shap_values).head()
shap.initjs()
# Per-row force plots for two sample validation rows.
shap.force_plot(explainer.expected_value, shap_values[0,:], X_validation_enc2.iloc[0,:])
shap.force_plot(explainer.expected_value, shap_values[20,:], X_validation_enc2.iloc[20,:])
# BUG FIX: this plot previously sliced X_validation_enc (no '2') -- a
# different frame from the one the SHAP values were computed on, so feature
# values and SHAP values could be misaligned. Use X_validation_enc2 throughout.
shap.force_plot(explainer.expected_value, shap_values[:2000,:], X_validation_enc2.iloc[:2000,:])
ABS_SHAP(shap_values, X_validation_enc2)
shap.summary_plot(shap_values, X_validation_enc2)
# Encode every categorical column on a copy of the regression frame, then
# eyeball the frames that feed the deployment-time predictions below.
# NOTE(review): `encode` (defined earlier in the file) is assumed to mutate
# the passed column in place -- confirm, since its return value is discarded.
regress_df_enc = regress_df.copy()
object_cols = regress_df_enc.select_dtypes(include='object').columns
for object_col in object_cols:
    encode(regress_df_enc[object_col])
X_train_enc2.info()
regress_df_enc.drop(list_to_remove2, axis=1).info()
proba_df.info()
regress_df_enc[regress_df_enc['Response'] == 1].info()
# Predict (log) sales for responders (Response == 1) with the tuned GBM,
# then attach the estimates to the responder frame.
regress_est = grid_gbmr.predict(
    regress_df_enc[regress_df_enc['Response'] == 1].drop(list_to_remove2, axis=1))
regress_est_log = pd.DataFrame(data=regress_est, columns=['log_hats_y'])
# Back-transform from log space to the original sales scale.
regress_est_log['Est_Sale'] = np.exp(regress_est_log['log_hats_y'])
regress_est_log
regress_df_resp
# reset_index() aligns the responder rows 0..n-1 with the prediction frame
# before the column-wise concat.
regress_df = pd.concat([regress_df_resp.reset_index(), regress_est_log], axis=1)
regress_df.head(10)
# BUG FIX: DataFrame.rename returns a copy; the original discarded it, so the
# 'index' column was never actually renamed. Assign the result back.
regress_df = regress_df.rename(columns={'index': 'Aft_Regression_Index'})
regress_df.info()
# Join the responder-level sales estimates back onto the full scored base.
small_df = regress_df[['Customer_Number', 'avg_sales', 'Est_Prob', 'log_act_y',
                       'log_hats_y', 'Est_Sale']]
proba_df.info()
# left_on/right_on were the same key, so the shorter `on=` form is equivalent.
final_df_regression = proba_df.merge(small_df, on='Customer_Number', how='left')
final_df_regression.info()
final_df_regression
# Non-responders get no estimate from the left join; inspect the NaN rows,
# then zero-fill their sales-related columns.
final_df_regression[final_df_regression['Est_Sale'].isna()]
# Idiom: a single fillna(0.0) replaces five copy-pasted
# np.where(col.isna(), 0.00, col) assignments with identical results.
# NOTE(review): 'Est_Prob_y' implies the merge suffixed a duplicate
# 'Est_Prob' column -- confirm proba_df also carries 'Est_Prob'.
fill_cols = ['avg_sales', 'Est_Prob_y', 'log_act_y', 'log_hats_y', 'Est_Sale']
final_df_regression[fill_cols] = final_df_regression[fill_cols].fillna(0.0)
final_df_regression
# Split the scored base into two equal halves for downstream use.
# NOTE(review): stratify=y assumes y aligns row-for-row with
# final_df_regression after the merges above -- verify upstream.
X_batch1, X_batch2 = train_test_split(final_df_regression, stratify=y,
                                      test_size=0.50, random_state=42)
print(X_batch1.shape, X_batch2.shape)
# One loop instead of three copy-pasted export stanzas; `file_path` keeps its
# original final value (the batch-2 path) after the loop.
exports = [
    ('../PGDADS_Capstone Assignment/GBTregressionmodel.csv', final_df_regression),
    ('../PGDADS_Capstone Assignment/X_batch1regressionmodel.csv', X_batch1),
    ('../PGDADS_Capstone Assignment/X_batch2regressionmodel.csv', X_batch2),
]
for file_path, frame in exports:
    frame.to_csv(file_path)
    print('Data exported successfully!')